In [1]:
import matplotlib.pyplot as plt
import plotly as ply
import plotly.graph_objects as go
import pandas as pd
import numpy as np
import seaborn as sns
import datetime
import missingno
In [2]:
df = pd.read_csv("../data/application_data.csv")
df
Out[2]:
SK_ID_CURR TARGET NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY CNT_CHILDREN AMT_INCOME_TOTAL AMT_CREDIT AMT_ANNUITY ... FLAG_DOCUMENT_18 FLAG_DOCUMENT_19 FLAG_DOCUMENT_20 FLAG_DOCUMENT_21 AMT_REQ_CREDIT_BUREAU_HOUR AMT_REQ_CREDIT_BUREAU_DAY AMT_REQ_CREDIT_BUREAU_WEEK AMT_REQ_CREDIT_BUREAU_MON AMT_REQ_CREDIT_BUREAU_QRT AMT_REQ_CREDIT_BUREAU_YEAR
0 100002 1 Cash loans M N Y 0 202500.0 406597.5 24700.5 ... 0 0 0 0 0.0 0.0 0.0 0.0 0.0 1.0
1 100003 0 Cash loans F N N 0 270000.0 1293502.5 35698.5 ... 0 0 0 0 0.0 0.0 0.0 0.0 0.0 0.0
2 100004 0 Revolving loans M Y Y 0 67500.0 135000.0 6750.0 ... 0 0 0 0 0.0 0.0 0.0 0.0 0.0 0.0
3 100006 0 Cash loans F N Y 0 135000.0 312682.5 29686.5 ... 0 0 0 0 NaN NaN NaN NaN NaN NaN
4 100007 0 Cash loans M N Y 0 121500.0 513000.0 21865.5 ... 0 0 0 0 0.0 0.0 0.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
307506 456251 0 Cash loans M N N 0 157500.0 254700.0 27558.0 ... 0 0 0 0 NaN NaN NaN NaN NaN NaN
307507 456252 0 Cash loans F N Y 0 72000.0 269550.0 12001.5 ... 0 0 0 0 NaN NaN NaN NaN NaN NaN
307508 456253 0 Cash loans F N Y 0 153000.0 677664.0 29979.0 ... 0 0 0 0 1.0 0.0 0.0 1.0 0.0 1.0
307509 456254 1 Cash loans F N Y 0 171000.0 370107.0 20205.0 ... 0 0 0 0 0.0 0.0 0.0 0.0 0.0 0.0
307510 456255 0 Cash loans F N N 0 157500.0 675000.0 49117.5 ... 0 0 0 0 0.0 0.0 0.0 2.0 0.0 1.0

307511 rows × 122 columns

In [3]:
data_descriptions = pd.read_csv("../data/columns_description.csv", encoding='latin1')
data_descriptions.head()
Out[3]:
Unnamed: 0 Table Row Description Special
0 1 application_data SK_ID_CURR ID of loan in our sample NaN
1 2 application_data TARGET Target variable (1 - client with payment diffi... NaN
2 5 application_data NAME_CONTRACT_TYPE Identification if loan is cash or revolving NaN
3 6 application_data CODE_GENDER Gender of the client NaN
4 7 application_data FLAG_OWN_CAR Flag if the client owns a car NaN
In [4]:
my_vars = pd.read_csv("../data/my_vars.csv")
my_vars_vec = my_vars['Row'].to_list()
In [5]:
my_df = df[my_vars_vec].copy()
my_df.head()
Out[5]:
AMT_ANNUITY AMT_GOODS_PRICE OWN_CAR_AGE CNT_FAM_MEMBERS EXT_SOURCE_1 EXT_SOURCE_2 EXT_SOURCE_3 APARTMENTS_AVG BASEMENTAREA_AVG YEARS_BEGINEXPLUATATION_AVG YEARS_BUILD_AVG COMMONAREA_AVG ELEVATORS_AVG ENTRANCES_AVG FLOORSMAX_AVG
0 24700.5 351000.0 NaN 1.0 0.083037 0.262949 0.139376 0.0247 0.0369 0.9722 0.6192 0.0143 0.00 0.0690 0.0833
1 35698.5 1129500.0 NaN 2.0 0.311267 0.622246 NaN 0.0959 0.0529 0.9851 0.7960 0.0605 0.08 0.0345 0.2917
2 6750.0 135000.0 26.0 1.0 NaN 0.555912 0.729567 NaN NaN NaN NaN NaN NaN NaN NaN
3 29686.5 297000.0 NaN 2.0 NaN 0.650442 NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 21865.5 513000.0 NaN 1.0 NaN 0.322738 NaN NaN NaN NaN NaN NaN NaN NaN NaN
In [6]:
aux = my_vars[my_vars['Category'] == 1]
my_vars_vec = aux['Row'].to_list()
first_df = df[my_vars_vec].copy()
aux = my_vars[my_vars['Category'] == 2]
my_vars_vec = aux['Row'].to_list()
second_df = df[my_vars_vec].copy()
aux = my_vars[my_vars['Category'] == 3]
my_vars_vec = aux['Row'].to_list()
third_df = df[my_vars_vec].copy()
aux = my_vars[my_vars['Category'] == 4]
my_vars_vec = aux['Row'].to_list()
fourth_df = df[my_vars_vec].copy()
aux = my_vars[my_vars['Category'] == 5]
my_vars_vec = aux['Row'].to_list()
fifth_df = df[my_vars_vec].copy()
In [7]:
%matplotlib inline
missingno.bar(df, color=(63/255,132/255,172/255))
Out[7]:
<AxesSubplot: >
In [8]:
%matplotlib inline
missingno.matrix(df, color=(63/255,132/255,172/255))
Out[8]:
<AxesSubplot: >
In [9]:
%matplotlib inline
missingno.heatmap(df)
Out[9]:
<AxesSubplot: >
In [10]:
%matplotlib inline
missingno.heatmap(df[['AMT_REQ_CREDIT_BUREAU_DAY','AMT_REQ_CREDIT_BUREAU_MON','AMT_REQ_CREDIT_BUREAU_YEAR','EXT_SOURCE_3']])
Out[10]:
<AxesSubplot: >
In [11]:
%matplotlib inline
missingno.heatmap(df[['AMT_ANNUITY','AMT_GOODS_PRICE','NAME_TYPE_SUITE','OWN_CAR_AGE','OCCUPATION_TYPE','CNT_FAM_MEMBERS','EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3']])
Out[11]:
<AxesSubplot: >
In [12]:
%matplotlib inline
missingno.heatmap(df[['OBS_30_CNT_SOCIAL_CIRCLE','DEF_30_CNT_SOCIAL_CIRCLE','OBS_60_CNT_SOCIAL_CIRCLE','DEF_60_CNT_SOCIAL_CIRCLE']])
Out[12]:
<AxesSubplot: >
In [13]:
%matplotlib inline
missingno.bar(my_df, color=(63/255,132/255,172/255))
Out[13]:
<AxesSubplot: >
In [14]:
%matplotlib inline
missingno.matrix(my_df, color=(63/255,132/255,172/255))
Out[14]:
<AxesSubplot: >
In [15]:
%matplotlib inline
missingno.heatmap(my_df)
Out[15]:
<AxesSubplot: >
In [16]:
%matplotlib inline
missingno.bar(first_df, color=(63/255,132/255,172/255))
Out[16]:
<AxesSubplot: >
In [17]:
%matplotlib inline
missingno.matrix(first_df, color=(63/255,132/255,172/255))
Out[17]:
<AxesSubplot: >
In [18]:
print(data_descriptions[data_descriptions['Row'] == 'AMT_ANNUITY']['Description'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'AMT_ANNUITY']['Special'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'AMT_GOODS_PRICE']['Description'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'AMT_GOODS_PRICE']['Special'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'NAME_TYPE_SUITE']['Description'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'NAME_TYPE_SUITE']['Special'].to_list())
['Loan annuity']
[nan]
['For consumer loans it is the price of the goods for which the loan is given']
[nan]
['Who was accompanying client when he was applying for the loan']
[nan]
In [19]:
df_helper = df[['AMT_ANNUITY']].copy()
sns.displot(data=df_helper, x="AMT_ANNUITY", kde=True)
Out[19]:
<seaborn.axisgrid.FacetGrid at 0x7efcb0921db0>
In [20]:
df_helper = df[['AMT_GOODS_PRICE']].copy()
sns.displot(data=df_helper, x="AMT_GOODS_PRICE", kde=False)
Out[20]:
<seaborn.axisgrid.FacetGrid at 0x7efcb016b970>
In [21]:
df_helper = df[['NAME_TYPE_SUITE']].copy()
df_helper.drop_duplicates(inplace=True)
df_helper
Out[21]:
NAME_TYPE_SUITE
0 Unaccompanied
1 Family
5 Spouse, partner
8 Children
18 Other_A
64 NaN
174 Other_B
416 Group of people
In [22]:
pd.set_option('display.max_columns', None)
df[df['AMT_ANNUITY'].isna()]
Out[22]:
SK_ID_CURR TARGET NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY CNT_CHILDREN AMT_INCOME_TOTAL AMT_CREDIT AMT_ANNUITY AMT_GOODS_PRICE NAME_TYPE_SUITE NAME_INCOME_TYPE NAME_EDUCATION_TYPE NAME_FAMILY_STATUS NAME_HOUSING_TYPE REGION_POPULATION_RELATIVE DAYS_BIRTH DAYS_EMPLOYED DAYS_REGISTRATION DAYS_ID_PUBLISH OWN_CAR_AGE FLAG_MOBIL FLAG_EMP_PHONE FLAG_WORK_PHONE FLAG_CONT_MOBILE FLAG_PHONE FLAG_EMAIL OCCUPATION_TYPE CNT_FAM_MEMBERS REGION_RATING_CLIENT REGION_RATING_CLIENT_W_CITY WEEKDAY_APPR_PROCESS_START HOUR_APPR_PROCESS_START REG_REGION_NOT_LIVE_REGION REG_REGION_NOT_WORK_REGION LIVE_REGION_NOT_WORK_REGION REG_CITY_NOT_LIVE_CITY REG_CITY_NOT_WORK_CITY LIVE_CITY_NOT_WORK_CITY ORGANIZATION_TYPE EXT_SOURCE_1 EXT_SOURCE_2 EXT_SOURCE_3 APARTMENTS_AVG BASEMENTAREA_AVG YEARS_BEGINEXPLUATATION_AVG YEARS_BUILD_AVG COMMONAREA_AVG ELEVATORS_AVG ENTRANCES_AVG FLOORSMAX_AVG FLOORSMIN_AVG LANDAREA_AVG LIVINGAPARTMENTS_AVG LIVINGAREA_AVG NONLIVINGAPARTMENTS_AVG NONLIVINGAREA_AVG APARTMENTS_MODE BASEMENTAREA_MODE YEARS_BEGINEXPLUATATION_MODE YEARS_BUILD_MODE COMMONAREA_MODE ELEVATORS_MODE ENTRANCES_MODE FLOORSMAX_MODE FLOORSMIN_MODE LANDAREA_MODE LIVINGAPARTMENTS_MODE LIVINGAREA_MODE NONLIVINGAPARTMENTS_MODE NONLIVINGAREA_MODE APARTMENTS_MEDI BASEMENTAREA_MEDI YEARS_BEGINEXPLUATATION_MEDI YEARS_BUILD_MEDI COMMONAREA_MEDI ELEVATORS_MEDI ENTRANCES_MEDI FLOORSMAX_MEDI FLOORSMIN_MEDI LANDAREA_MEDI LIVINGAPARTMENTS_MEDI LIVINGAREA_MEDI NONLIVINGAPARTMENTS_MEDI NONLIVINGAREA_MEDI FONDKAPREMONT_MODE HOUSETYPE_MODE TOTALAREA_MODE WALLSMATERIAL_MODE EMERGENCYSTATE_MODE OBS_30_CNT_SOCIAL_CIRCLE DEF_30_CNT_SOCIAL_CIRCLE OBS_60_CNT_SOCIAL_CIRCLE DEF_60_CNT_SOCIAL_CIRCLE DAYS_LAST_PHONE_CHANGE FLAG_DOCUMENT_2 FLAG_DOCUMENT_3 FLAG_DOCUMENT_4 FLAG_DOCUMENT_5 FLAG_DOCUMENT_6 FLAG_DOCUMENT_7 FLAG_DOCUMENT_8 FLAG_DOCUMENT_9 FLAG_DOCUMENT_10 FLAG_DOCUMENT_11 FLAG_DOCUMENT_12 FLAG_DOCUMENT_13 FLAG_DOCUMENT_14 FLAG_DOCUMENT_15 FLAG_DOCUMENT_16 FLAG_DOCUMENT_17 FLAG_DOCUMENT_18 FLAG_DOCUMENT_19 FLAG_DOCUMENT_20 FLAG_DOCUMENT_21 AMT_REQ_CREDIT_BUREAU_HOUR AMT_REQ_CREDIT_BUREAU_DAY AMT_REQ_CREDIT_BUREAU_WEEK AMT_REQ_CREDIT_BUREAU_MON AMT_REQ_CREDIT_BUREAU_QRT AMT_REQ_CREDIT_BUREAU_YEAR
47531 155054 0 Cash loans M N N 0 180000.0 450000.0 NaN 450000.0 Unaccompanied Commercial associate Incomplete higher Single / not married House / apartment 0.026392 -10668 -2523 -4946.0 -3238 NaN 1 1 1 1 1 0 High skill tech staff 1.0 2 2 WEDNESDAY 13 0 0 0 0 0 0 Business Entity Type 3 NaN 0.626896 0.372334 0.0124 NaN 0.9662 NaN NaN 0.00 0.0345 0.0417 NaN NaN NaN 0.0115 NaN 0.0000 0.0126 NaN 0.9662 NaN NaN 0.0000 0.0345 0.0417 NaN NaN NaN 0.0119 NaN 0.0000 0.0125 NaN 0.9662 NaN NaN 0.00 0.0345 0.0417 NaN NaN NaN 0.0117 NaN 0.0000 NaN block of flats 0.0090 Stone, brick No 1.0 0.0 1.0 0.0 -2.0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0 0.0 0.0 0.0 1.0 1.0
50035 157917 0 Cash loans F N N 0 94500.0 450000.0 NaN 450000.0 Unaccompanied Working Lower secondary Civil marriage House / apartment 0.035792 -9027 -1270 -3640.0 -741 NaN 1 1 1 1 0 0 Laborers 2.0 2 2 MONDAY 20 0 0 0 0 0 0 Business Entity Type 1 NaN 0.727274 0.468660 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.0 0.0 0.0 0.0 -706.0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0 0.0 0.0 1.0 0.0 1.0
51594 159744 0 Cash loans F N N 0 202500.0 539100.0 NaN 450000.0 Unaccompanied Working Secondary / secondary special Married House / apartment 0.046220 -17376 -3950 -11524.0 -831 NaN 1 1 0 1 0 0 Cooking staff 2.0 1 1 WEDNESDAY 15 0 0 0 1 1 1 Self-employed 0.885980 0.738370 0.452534 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.0 0.0 1.0 0.0 -199.0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0 0.0 0.0 1.0 0.0 1.0
55025 163757 0 Cash loans F N N 0 162000.0 296280.0 NaN 225000.0 Unaccompanied State servant Higher education Married House / apartment 0.035792 -11329 -2040 -3195.0 -3069 NaN 1 1 0 1 0 0 Core staff 2.0 2 2 FRIDAY 13 0 0 0 1 1 1 Government 0.336803 0.566316 0.220095 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.0 0.0 0.0 0.0 -2841.0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0 0.0 0.0 1.0 0.0 4.0
59934 169487 0 Cash loans M Y N 0 202500.0 360000.0 NaN 360000.0 Unaccompanied Working Secondary / secondary special Single / not married House / apartment 0.046220 -19762 -2498 -11285.0 -3305 4.0 1 1 1 1 1 0 Laborers 1.0 1 1 FRIDAY 15 0 1 1 0 1 1 Other 0.647350 0.583947 0.177704 0.0856 NaN 0.9841 NaN NaN 0.08 0.0345 0.4583 NaN NaN NaN 0.0548 NaN 0.1139 0.0872 NaN 0.9841 NaN NaN 0.0806 0.0345 0.4583 NaN NaN NaN 0.0571 NaN 0.1206 0.0864 NaN 0.9841 NaN NaN 0.08 0.0345 0.4583 NaN NaN NaN 0.0558 NaN 0.1163 NaN block of flats 0.0679 Stone, brick No 0.0 0.0 0.0 0.0 -743.0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0 0.0 1.0 0.0 0.0 6.0
75873 187985 0 Cash loans M Y N 0 144000.0 219249.0 NaN 166500.0 Unaccompanied Working Higher education Single / not married Rented apartment 0.022800 -20831 -2450 -771.0 -4203 8.0 1 1 0 1 0 0 Drivers 1.0 2 2 FRIDAY 15 0 0 0 0 0 0 Business Entity Type 3 0.339389 0.667564 0.425893 0.0928 0.0719 0.9831 0.7688 0.0113 0.00 0.2069 0.1667 0.0417 0.6932 0.0748 0.0865 0.0039 0.0031 0.0945 0.0746 0.9831 0.7779 0.0114 0.0000 0.2069 0.1667 0.0417 0.7091 0.0817 0.0901 0.0039 0.0033 0.0937 0.0719 0.9831 0.7719 0.0113 0.00 0.2069 0.1667 0.0417 0.7053 0.0761 0.0880 0.0039 0.0032 not specified block of flats 0.0687 Panel No 0.0 0.0 0.0 0.0 -1986.0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0 0.0 0.0 0.0 0.0 11.0
89343 203726 0 Cash loans F Y N 0 90000.0 157500.0 NaN 157500.0 Unaccompanied State servant Secondary / secondary special Married House / apartment 0.015221 -12134 -3721 -858.0 -591 2.0 1 1 0 1 1 0 Medicine staff 2.0 2 2 WEDNESDAY 11 0 0 0 0 0 0 Medicine 0.682126 0.154811 0.297087 0.1227 NaN 0.9781 NaN NaN 0.00 0.2759 0.1667 NaN NaN NaN 0.1131 NaN 0.0044 0.1250 NaN 0.9782 NaN NaN 0.0000 0.2759 0.1667 NaN NaN NaN 0.1178 NaN 0.0046 0.1239 NaN 0.9781 NaN NaN 0.00 0.2759 0.1667 NaN NaN NaN 0.1151 NaN 0.0045 NaN block of flats 0.1218 Panel No 0.0 0.0 0.0 0.0 -348.0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0 0.0 0.0 2.0 0.0 0.0
123872 243648 0 Cash loans F N Y 0 202500.0 929088.0 NaN 720000.0 Unaccompanied Working Secondary / secondary special Civil marriage House / apartment 0.019689 -13902 -3540 -168.0 -4250 NaN 1 1 0 1 0 1 Secretaries 2.0 2 2 SATURDAY 12 0 0 0 0 0 0 Self-employed 0.400723 0.510934 0.581484 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.0 0.0 0.0 0.0 -1331.0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0 0.0 0.0 0.0 1.0 5.0
207186 340147 0 Cash loans M N N 0 171000.0 486000.0 NaN 486000.0 Unaccompanied Commercial associate Higher education Married House / apartment 0.018634 -10151 -472 -10127.0 -2787 NaN 1 1 0 1 1 0 Security staff 2.0 2 2 WEDNESDAY 13 1 1 1 1 1 1 Security NaN 0.706306 0.391055 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.0 0.0 0.0 0.0 -295.0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0 0.0 1.0 1.0 0.0 2.0
227939 364022 0 Cash loans F N Y 0 315000.0 628069.5 NaN 499500.0 Unaccompanied Commercial associate Higher education Married Municipal apartment 0.046220 -16344 -1478 -482.0 -4192 NaN 1 1 0 1 0 0 Sales staff 2.0 1 1 WEDNESDAY 11 0 1 1 0 0 0 Business Entity Type 3 NaN 0.618698 0.431192 0.1918 0.1735 0.9866 0.8164 0.0464 0.12 0.1034 0.3333 0.0417 0.1784 0.1563 0.1728 0.0000 0.0000 0.1954 0.1800 0.9866 0.8236 0.0468 0.1208 0.1034 0.3333 0.0417 0.1825 0.1708 0.1800 0.0000 0.0000 0.1936 0.1735 0.9866 0.8189 0.0467 0.12 0.1034 0.3333 0.0417 0.1815 0.1590 0.1759 0.0000 0.0000 reg oper account block of flats 0.1613 Stone, brick No 1.0 0.0 1.0 0.0 -1255.0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0 0.0 1.0 0.0 0.0 2.0
239329 377174 0 Cash loans F N Y 0 157500.0 792000.0 NaN 792000.0 Family State servant Secondary / secondary special Married House / apartment 0.026392 -17661 -2885 -8594.0 -1221 NaN 1 1 0 1 1 0 Managers 2.0 2 2 SATURDAY 14 0 0 0 0 0 0 Other 0.775404 0.656305 0.651260 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.0 0.0 0.0 0.0 -322.0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0 0.0 0.0 0.0 1.0 1.0
241835 379997 0 Cash loans F N N 0 315000.0 1483231.5 NaN 1354500.0 Unaccompanied Working Higher education Married House / apartment 0.072508 -15072 -152 -7576.0 -4389 NaN 1 1 1 1 1 0 Accountants 2.0 1 1 WEDNESDAY 15 0 1 1 0 1 1 Self-employed 0.593394 0.267269 NaN 0.3691 0.1891 0.9806 0.7348 0.2590 0.52 0.2241 0.5417 0.5833 0.0000 0.3009 0.3756 0.0000 0.2002 0.2216 0.1435 0.9796 0.7321 0.1590 0.3222 0.1379 0.5417 0.5833 0.0000 0.1938 0.2268 0.0000 0.0720 0.3726 0.1891 0.9806 0.7383 0.2607 0.52 0.2241 0.5417 0.5833 0.0000 0.3061 0.3824 0.0000 0.2044 reg oper spec account block of flats 0.1860 Stone, brick No 0.0 0.0 0.0 0.0 -504.0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 NaN NaN NaN NaN NaN NaN
In [23]:
pd.reset_option('max_columns')

Ci sono 12 entrate che hanno AMT_ANNUITY a NaN, sono numeri, e non sembra chiara una scelta da sostituire al NaN, soprattutto in vista della forma dell'istogramma, non riesco neanche a immaginarmi spiegazioni sensate del perché questo dato possa mancare se non errore di inserimento. Non riesco a trovare colonne del dataset che sembrano essere collegate alla mancanza di questo dato.

In [24]:
df_helper = df[df['AMT_GOODS_PRICE'].isna()].copy()
df_helper = df_helper[['AMT_GOODS_PRICE', 'NAME_TYPE_SUITE']].drop_duplicates()
df_helper
Out[24]:
AMT_GOODS_PRICE NAME_TYPE_SUITE
724 NaN NaN
In [25]:
df_helper = df[df['AMT_GOODS_PRICE'].isna()].copy()
df_helper = df_helper[['AMT_GOODS_PRICE', 'NAME_TYPE_SUITE']]
df_helper
Out[25]:
AMT_GOODS_PRICE NAME_TYPE_SUITE
724 NaN NaN
5937 NaN NaN
6425 NaN NaN
6703 NaN NaN
7647 NaN NaN
... ... ...
304678 NaN NaN
304784 NaN NaN
305833 NaN NaN
306126 NaN NaN
306273 NaN NaN

278 rows × 2 columns

In [26]:
df_helper = df[df['NAME_TYPE_SUITE'].isna()].copy()
df_helper = df_helper[['AMT_GOODS_PRICE', 'NAME_TYPE_SUITE']]
df_helper
Out[26]:
AMT_GOODS_PRICE NAME_TYPE_SUITE
64 315000.0 NaN
724 NaN NaN
1199 477000.0 NaN
1669 832500.0 NaN
1862 243000.0 NaN
... ... ...
306273 NaN NaN
306544 894825.0 NaN
306615 225000.0 NaN
306636 585000.0 NaN
306725 675000.0 NaN

1292 rows × 2 columns

In tutte le entrate di AMT_GOODS_PRICE che contengono un NaN (278 righe) abbiamo che non è segnato neanche la persona accompagnatrice, che però è un dato mancante in 1292 righe. Infatti il viceversa non vale. Non sembra chiaro il perché ci possa essere un collegamento tra le due cose, ma era già evidenziato dall'heatmap

In [27]:
%matplotlib inline
missingno.bar(second_df, color=(63/255,132/255,172/255))
Out[27]:
<AxesSubplot: >
In [28]:
%matplotlib inline
second_df['FLAG_OWN_CAR'] = df['FLAG_OWN_CAR'].copy()
second_df['FLAG_OWN_CAR'].replace(to_replace='N',value=np.nan, inplace=True)
missingno.matrix(second_df, color=(63/255,132/255,172/255))
Out[28]:
<AxesSubplot: >
In [29]:
df_helper = df[['OWN_CAR_AGE']].copy()
sns.displot(data=df_helper, x="OWN_CAR_AGE", kde=True)
Out[29]:
<seaborn.axisgrid.FacetGrid at 0x7efcb041c460>
In [30]:
df_helper = df[['DAYS_BIRTH']].copy()
df_helper['YEARS_BIRTH_POSITIVE'] = df_helper.apply(lambda vec : np.abs(vec[:])/365)
sns.displot(data=df_helper, x="YEARS_BIRTH_POSITIVE", kde=True)
Out[30]:
<seaborn.axisgrid.FacetGrid at 0x7efcb02dfd00>
In [31]:
print(data_descriptions[data_descriptions['Row'] == 'OWN_CAR_AGE']['Description'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'OWN_CAR_AGE']['Special'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'FLAG_OWN_CAR']['Description'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'FLAG_OWN_CAR']['Special'].to_list())
["Age of client's car"]
[nan]
['Flag if the client owns a car']
[nan]
In [32]:
helper_df = df[df['OWN_CAR_AGE'].isna()].copy()
print(len(helper_df))
helper_df = helper_df[['FLAG_OWN_CAR','OWN_CAR_AGE']]
helper_df.drop_duplicates(inplace=True)
helper_df
202929
Out[32]:
FLAG_OWN_CAR OWN_CAR_AGE
0 N NaN
30897 Y NaN
In [33]:
helper_df = df[['FLAG_OWN_CAR','OWN_CAR_AGE']].copy()
helper_df = helper_df[(helper_df['FLAG_OWN_CAR'] == 'Y') & (helper_df['OWN_CAR_AGE'].isna())]
helper_df
Out[33]:
FLAG_OWN_CAR OWN_CAR_AGE
30897 Y NaN
181231 Y NaN
217549 Y NaN
229867 Y NaN
236868 Y NaN
In [34]:
helper_df = df[['FLAG_OWN_CAR','OWN_CAR_AGE']].copy()
helper_df = helper_df[(helper_df['FLAG_OWN_CAR'] == 'N') & (helper_df['OWN_CAR_AGE'].notna())]
helper_df
Out[34]:
FLAG_OWN_CAR OWN_CAR_AGE

Per quanto riguarda OWN_CAR_AGE, abbiamo 202929 NaN che corrispondono quasi interamente a casi in cui FLAG_OWN_CAR è posto a N, quindi persone che non hanno una automobile. Tuttavia in 5 casi risulta FLAG_OWN_CAR = Y e comunque OWN_CAR_AGE = None, questi sono casi in cui effettivamente il valore manca. Il caso contrario, ovvero FLAG_OWN_CAR pari a N e OWN_CAR_AGE non nullo, invece, non capita mai.

In [35]:
%matplotlib inline
missingno.bar(third_df, color=(63/255,132/255,172/255))
Out[35]:
<AxesSubplot: >
In [36]:
%matplotlib inline
missingno.matrix(third_df, color=(63/255,132/255,172/255))
Out[36]:
<AxesSubplot: >
In [37]:
print(data_descriptions[data_descriptions['Row'] == 'CNT_FAM_MEMBERS']['Description'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'CNT_FAM_MEMBERS']['Special'].to_list())
['How many family members does client have']
[nan]
In [38]:
df[df['CNT_FAM_MEMBERS'].isna()]
Out[38]:
SK_ID_CURR TARGET NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY CNT_CHILDREN AMT_INCOME_TOTAL AMT_CREDIT AMT_ANNUITY ... FLAG_DOCUMENT_18 FLAG_DOCUMENT_19 FLAG_DOCUMENT_20 FLAG_DOCUMENT_21 AMT_REQ_CREDIT_BUREAU_HOUR AMT_REQ_CREDIT_BUREAU_DAY AMT_REQ_CREDIT_BUREAU_WEEK AMT_REQ_CREDIT_BUREAU_MON AMT_REQ_CREDIT_BUREAU_QRT AMT_REQ_CREDIT_BUREAU_YEAR
41982 148605 0 Revolving loans M N Y 0 450000.0 675000.0 33750.0 ... 0 0 0 0 NaN NaN NaN NaN NaN NaN
187348 317181 0 Revolving loans F N Y 0 202500.0 585000.0 29250.0 ... 0 0 0 0 0.0 0.0 0.0 0.0 0.0 1.0

2 rows × 122 columns

In [39]:
df[df['CNT_FAM_MEMBERS'] == 0]
Out[39]:
SK_ID_CURR TARGET NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY CNT_CHILDREN AMT_INCOME_TOTAL AMT_CREDIT AMT_ANNUITY ... FLAG_DOCUMENT_18 FLAG_DOCUMENT_19 FLAG_DOCUMENT_20 FLAG_DOCUMENT_21 AMT_REQ_CREDIT_BUREAU_HOUR AMT_REQ_CREDIT_BUREAU_DAY AMT_REQ_CREDIT_BUREAU_WEEK AMT_REQ_CREDIT_BUREAU_MON AMT_REQ_CREDIT_BUREAU_QRT AMT_REQ_CREDIT_BUREAU_YEAR

0 rows × 122 columns

In [40]:
df[df['CNT_FAM_MEMBERS'] == 1]
Out[40]:
SK_ID_CURR TARGET NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY CNT_CHILDREN AMT_INCOME_TOTAL AMT_CREDIT AMT_ANNUITY ... FLAG_DOCUMENT_18 FLAG_DOCUMENT_19 FLAG_DOCUMENT_20 FLAG_DOCUMENT_21 AMT_REQ_CREDIT_BUREAU_HOUR AMT_REQ_CREDIT_BUREAU_DAY AMT_REQ_CREDIT_BUREAU_WEEK AMT_REQ_CREDIT_BUREAU_MON AMT_REQ_CREDIT_BUREAU_QRT AMT_REQ_CREDIT_BUREAU_YEAR
0 100002 1 Cash loans M N Y 0 202500.0 406597.5 24700.5 ... 0 0 0 0 0.0 0.0 0.0 0.0 0.0 1.0
2 100004 0 Revolving loans M Y Y 0 67500.0 135000.0 6750.0 ... 0 0 0 0 0.0 0.0 0.0 0.0 0.0 0.0
4 100007 0 Cash loans M N Y 0 121500.0 513000.0 21865.5 ... 0 0 0 0 0.0 0.0 0.0 0.0 0.0 0.0
9 100012 0 Revolving loans M N Y 0 135000.0 405000.0 20250.0 ... 0 0 0 0 NaN NaN NaN NaN NaN NaN
15 100019 0 Cash loans M Y Y 0 157500.0 299772.0 20160.0 ... 0 0 0 0 0.0 0.0 0.0 0.0 0.0 1.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
307504 456248 0 Cash loans F N Y 0 153000.0 331920.0 16096.5 ... 0 0 0 0 NaN NaN NaN NaN NaN NaN
307505 456249 0 Cash loans F N Y 0 112500.0 225000.0 22050.0 ... 0 0 0 0 0.0 0.0 0.0 2.0 0.0 0.0
307506 456251 0 Cash loans M N N 0 157500.0 254700.0 27558.0 ... 0 0 0 0 NaN NaN NaN NaN NaN NaN
307507 456252 0 Cash loans F N Y 0 72000.0 269550.0 12001.5 ... 0 0 0 0 NaN NaN NaN NaN NaN NaN
307508 456253 0 Cash loans F N Y 0 153000.0 677664.0 29979.0 ... 0 0 0 0 1.0 0.0 0.0 1.0 0.0 1.0

67847 rows × 122 columns

Ci sono esattamente due casi in cui CNT_FAM_MEMBERS risulta pari a NaN, sembrano dei chiari errori di inserimento

In [41]:
%matplotlib inline
missingno.bar(fourth_df, color=(63/255,132/255,172/255))
Out[41]:
<AxesSubplot: >
In [42]:
%matplotlib inline
missingno.matrix(fourth_df, color=(63/255,132/255,172/255))
Out[42]:
<AxesSubplot: >
In [43]:
print(data_descriptions[data_descriptions['Row'] == 'EXT_SOURCE_1']['Description'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'EXT_SOURCE_1']['Special'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'EXT_SOURCE_2']['Description'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'EXT_SOURCE_2']['Special'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'EXT_SOURCE_3']['Description'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'EXT_SOURCE_3']['Special'].to_list())
['Normalized score from external data source']
['normalized']
['Normalized score from external data source']
['normalized']
['Normalized score from external data source']
['normalized']
In [44]:
sns.pairplot(df[['EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3']], markers='*')
Out[44]:
<seaborn.axisgrid.PairGrid at 0x7efcabd53b20>
In [45]:
print(len(df[df['EXT_SOURCE_1'].isna()]))
print(len(df[df['EXT_SOURCE_2'].isna()]))
print(len(df[df['EXT_SOURCE_3'].isna()]))
print(len(df[df['EXT_SOURCE_3'] <= 0.0005272653]))
print(len(df[df['EXT_SOURCE_3'] <= 0.0005272652]))
173378
660
60965
886
0
In [46]:
df[df['EXT_SOURCE_3'] <= 0.0005272653][['EXT_SOURCE_3']]
Out[46]:
EXT_SOURCE_3
374 0.000527
970 0.000527
991 0.000527
1152 0.000527
1283 0.000527
... ...
305434 0.000527
305726 0.000527
306408 0.000527
306430 0.000527
306557 0.000527

886 rows × 1 columns

Purtroppo non c'è nessuna spiegazione di questi dati, che contengono un sacco di entrate pari a NaN. Inoltre EXT_SOURCE_3 ha 886 sospette entrate molto vicine a 0

In [47]:
%matplotlib inline
missingno.bar(fifth_df, color=(63/255,132/255,172/255))
Out[47]:
<AxesSubplot: >
In [48]:
%matplotlib inline
missingno.matrix(fifth_df, color=(63/255,132/255,172/255))
Out[48]:
<AxesSubplot: >
In [49]:
%matplotlib inline
missingno.heatmap(fifth_df)
Out[49]:
<AxesSubplot: >
In [50]:
aux = my_vars[my_vars['Category'] == 5]
my_vars_vec = aux['Row'].to_list()
sns.histplot(df[my_vars_vec[0]])
Out[50]:
<AxesSubplot: xlabel='APARTMENTS_AVG', ylabel='Count'>
In [51]:
sns.histplot(df[my_vars_vec[1]])
Out[51]:
<AxesSubplot: xlabel='BASEMENTAREA_AVG', ylabel='Count'>
In [52]:
sns.histplot(df[my_vars_vec[2]])
Out[52]:
<AxesSubplot: xlabel='YEARS_BEGINEXPLUATATION_AVG', ylabel='Count'>
In [53]:
sns.histplot(df[my_vars_vec[3]])
Out[53]:
<AxesSubplot: xlabel='YEARS_BUILD_AVG', ylabel='Count'>
In [54]:
sns.histplot(df[my_vars_vec[4]])
Out[54]:
<AxesSubplot: xlabel='COMMONAREA_AVG', ylabel='Count'>
In [55]:
sns.histplot(df[my_vars_vec[5]])
Out[55]:
<AxesSubplot: xlabel='ELEVATORS_AVG', ylabel='Count'>
In [56]:
sns.histplot(df[my_vars_vec[6]])
Out[56]:
<AxesSubplot: xlabel='ENTRANCES_AVG', ylabel='Count'>
In [57]:
sns.histplot(df[my_vars_vec[7]])
Out[57]:
<AxesSubplot: xlabel='FLOORSMAX_AVG', ylabel='Count'>
In [58]:
print(data_descriptions[data_descriptions['Row'] == 'APARTMENTS_AVG']['Description'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'APARTMENTS_AVG']['Special'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'BASEMENTAREA_AVG']['Description'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'BASEMENTAREA_AVG']['Special'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'YEARS_BEGINEXPLUATATION_AVG']['Description'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'YEARS_BEGINEXPLUATATION_AVG']['Special'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'YEARS_BUILD_AVG']['Description'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'YEARS_BUILD_AVG']['Special'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'COMMONAREA_AVG']['Description'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'COMMONAREA_AVG']['Special'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'ELEVATORS_AVG']['Description'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'ELEVATORS_AVG']['Special'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'ENTRANCES_AVG']['Description'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'ENTRANCES_AVG']['Special'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'FLOORSMAX_AVG']['Description'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'FLOORSMAX_AVG']['Special'].to_list())
['Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor']
['normalized']
['Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor']
['normalized']
['Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor']
['normalized']
['Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor']
['normalized']
['Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor']
['normalized']
['Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor']
['normalized']
['Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor']
['normalized']
['Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor']
['normalized']
In [59]:
sixth_df = fifth_df.copy()
sixth_df['FLAG_OWN_REALTY'] = df['FLAG_OWN_REALTY'].copy()
sixth_df['FLAG_OWN_REALTY'].replace(to_replace='N',value=np.nan, inplace=True)
In [60]:
%matplotlib inline
missingno.bar(sixth_df, color=(63/255,132/255,172/255))
Out[60]:
<AxesSubplot: >
In [61]:
%matplotlib inline
missingno.matrix(sixth_df, color=(63/255,132/255,172/255))
Out[61]:
<AxesSubplot: >
In [62]:
%matplotlib inline
missingno.heatmap(sixth_df)
Out[62]:
<AxesSubplot: >

A quanto pare non c'è legame tra FLAG_OWN_REALTY e queste variabili relative al luogo di abitazione

In [63]:
sixth_df[sixth_df['FLAG_OWN_REALTY'].isna()]
Out[63]:
APARTMENTS_AVG BASEMENTAREA_AVG YEARS_BEGINEXPLUATATION_AVG YEARS_BUILD_AVG COMMONAREA_AVG ELEVATORS_AVG ENTRANCES_AVG FLOORSMAX_AVG FLAG_OWN_REALTY
1 0.0959 0.0529 0.9851 0.7960 0.0605 0.08 0.0345 0.2917 NaN
13 0.1474 0.0973 0.9806 0.7348 0.0582 0.16 0.1379 0.3333 NaN
16 NaN NaN NaN NaN NaN NaN NaN NaN NaN
22 0.0907 0.0795 0.9786 0.7076 0.0120 0.00 0.2069 0.1667 NaN
24 0.1433 0.1455 0.9861 0.8096 0.0212 0.00 0.3103 0.1667 NaN
... ... ... ... ... ... ... ... ... ...
307494 NaN NaN NaN NaN NaN NaN NaN NaN NaN
307495 0.0742 NaN 0.9871 NaN 0.0136 0.08 0.0690 0.3333 NaN
307496 NaN NaN NaN NaN NaN NaN NaN NaN NaN
307506 0.2021 0.0887 0.9876 0.8300 0.0202 0.22 0.1034 0.6042 NaN
307510 0.0742 0.0526 0.9881 NaN 0.0176 0.08 0.0690 0.3750 NaN

94199 rows × 9 columns

Boh provo intanto a togliere tutte le righe in cui sono presenti tutti NaN relativi alle colonne relative al luogo di abitazione

In [64]:
df_vars_abitazione = data_descriptions[(data_descriptions['Unnamed: 0'] >= 47) & (data_descriptions['Unnamed: 0'] <= 93)]
vars_abitazione = df_vars_abitazione['Row'].to_list()
In [65]:
df_abitazione = df[vars_abitazione]
print(len(df_abitazione[df_abitazione.isna().all(axis=1)]))
df_abitazione = df_abitazione[df_abitazione.notna().any(axis=1)]
aux = my_vars[my_vars['Category'] == 5]
my_vars_vec = aux['Row'].to_list()
seventh_df = df_abitazione[my_vars_vec].copy()
seventh_df
145755
Out[65]:
APARTMENTS_AVG BASEMENTAREA_AVG YEARS_BEGINEXPLUATATION_AVG YEARS_BUILD_AVG COMMONAREA_AVG ELEVATORS_AVG ENTRANCES_AVG FLOORSMAX_AVG
0 0.0247 0.0369 0.9722 0.6192 0.0143 0.00 0.0690 0.0833
1 0.0959 0.0529 0.9851 0.7960 0.0605 0.08 0.0345 0.2917
12 0.0825 NaN 0.9811 NaN NaN 0.00 0.2069 0.1667
13 0.1474 0.0973 0.9806 0.7348 0.0582 0.16 0.1379 0.3333
14 0.3495 0.1335 0.9985 0.9796 0.1143 0.40 0.1724 0.6667
... ... ... ... ... ... ... ... ...
307506 0.2021 0.0887 0.9876 0.8300 0.0202 0.22 0.1034 0.6042
307507 0.0247 0.0435 0.9727 0.6260 0.0022 0.00 0.1034 0.0833
307508 0.1031 0.0862 0.9816 0.7484 0.0123 0.00 0.2069 0.1667
307509 0.0124 NaN 0.9771 NaN NaN NaN 0.0690 0.0417
307510 0.0742 0.0526 0.9881 NaN 0.0176 0.08 0.0690 0.3750

161756 rows × 8 columns

Ci sono 145755 entrate in cui tutte le variabili relative all'abitazione di residenza (non solo quelle datemi dalla suddivisione) sono NaN

In [66]:
%matplotlib inline
missingno.bar(seventh_df, color=(63/255,132/255,172/255))
Out[66]:
<AxesSubplot: >
In [67]:
%matplotlib inline
missingno.matrix(seventh_df, color=(63/255,132/255,172/255))
Out[67]:
<AxesSubplot: >
In [68]:
%matplotlib inline
missingno.heatmap(seventh_df)
Out[68]:
<AxesSubplot: >

Francamente ha senso su questo fare i plot anche considerando tutte le altre variabili relative all'abitazione

In [69]:
df_vars_abitazione = data_descriptions[(data_descriptions['Unnamed: 0'] >= 47) & (data_descriptions['Unnamed: 0'] <= 93)]
vars_abitazione = df_vars_abitazione['Row'].to_list()
df_abitazione = df[vars_abitazione]
print(len(df_abitazione[df_abitazione.isna().all(axis=1)]))
df_abitazione = df_abitazione[df_abitazione.notna().any(axis=1)]
145755
In [70]:
%matplotlib inline
missingno.bar(df_abitazione, color=(63/255,132/255,172/255))
Out[70]:
<AxesSubplot: >
In [71]:
%matplotlib inline
missingno.matrix(df_abitazione, color=(63/255,132/255,172/255))
Out[71]:
<AxesSubplot: >
In [72]:
%matplotlib inline
missingno.heatmap(df_abitazione)
Out[72]:
<AxesSubplot: >

Vista la forte correlazione ha senso tenenere in considerazione unicamente le variabili AVG

In [73]:
df_vars_abitazione = data_descriptions[(data_descriptions['Unnamed: 0'] >= 47) & (data_descriptions['Unnamed: 0'] <= 60)]
vars_abitazione = df_vars_abitazione['Row'].to_list()
df_abitazione = df[vars_abitazione]
print(len(df_abitazione[df_abitazione.isna().all(axis=1)]))
df_abitazione = df_abitazione[df_abitazione.notna().any(axis=1)]
148810
In [74]:
%matplotlib inline
missingno.bar(df_abitazione, color=(63/255,132/255,172/255))
Out[74]:
<AxesSubplot: >
In [75]:
%matplotlib inline
missingno.matrix(df_abitazione, color=(63/255,132/255,172/255))
Out[75]:
<AxesSubplot: >
In [76]:
%matplotlib inline
missingno.heatmap(df_abitazione)
Out[76]:
<AxesSubplot: >